set.seed(895893)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(broom)
library(papaja)
## Loading required package: tinylabels
This document outlines using the suggested procedure to simulate the number of participants necessary to accurately measure items. There are two key issues, known from the power literature, that this procedure should address:
Population. The data was simulated using the
rnorm function assuming a normal distribution for 30 scale
type items. Each population was simulated with 1000 data points. No
items were rounded for this simulation.
First, the scale of the data was manipulated by creating three sets of scales. The first scale was mimicked after small rating scales (i.e., 1-7 type style) using a \(\mu\) = 4 with a \(\sigma\) = .25 around the mean to create item mean variability. The second scale included a larger potential distribution of scores with a \(\mu\) = 50 (\(\sigma\) = 10) imitating a 0-100 scale. Last, the final scale included a \(\mu\) = 1000 (\(\sigma\) = 150) simulating a study that may include response latency data in the milliseconds. While there are many potential scales, these three represent a large number of potential variables in the social sciences. As we are suggesting variances as a key factor for estimating sample sizes, the scale of the data is influential on the amount of potential variance. Smaller ranges of data (1-7) cannot necessarily have the same variance as larger ranges (0-100).
Next, item variance heterogeneity was included by manipulating the potential \(\sigma\) for each individual item. For small scales, the \(\sigma\) = 2 points with a variability of .2, .4, and .8 for low, medium, and high heterogeneity in the variances between items. For the medium scale of data, \(\sigma\) = 20 with a variability of 4, 8, and 12. Last, for the large scale of data, \(\sigma\) = 350 with a variability of 100, 150, and 200 for heterogeneity.
# Population parameters: for each of three scale types, draw 30 item means
# and three sets of 30 item SDs (low / medium / high heterogeneity in the
# item variances).

# small potential variability overall, sort of 1-7ish scale
mu1 <- rnorm(30, 4, .25)
sigma1.1 <- rnorm(30, 2, .2)
sigma2.1 <- rnorm(30, 2, .4)
sigma3.1 <- rnorm(30, 2, .8)
# medium potential variability 0 to 100 scale
mu2 <- rnorm(30, 50, 10)
sigma1.2 <- rnorm(30, 20, 4)
sigma2.2 <- rnorm(30, 20, 8)
sigma3.2 <- rnorm(30, 20, 12)
# large potential variability in the 1000s scale
mu3 <- rnorm(30, 1000, 150)
sigma1.3 <- rnorm(30, 350, 100)
sigma2.3 <- rnorm(30, 350, 150)
sigma3.3 <- rnorm(30, 350, 200)
# SDs must be strictly positive for rnorm() (a negative sd yields NaN with a
# warning). The original code guarded only sigma3.3, but sigma1.3 (sd 100)
# and sigma2.3 (sd 150) can also be drawn negative, so redraw any
# large-scale SD vector containing a negative value. The guards consume
# random numbers only when triggered, so results under the original seed
# are unchanged when all draws come out positive.
while (any(sigma1.3 < 0)) {
  sigma1.3 <- rnorm(30, 350, 100)
}
while (any(sigma2.3 < 0)) {
  sigma2.3 <- rnorm(30, 350, 150)
}
while (any(sigma3.3 < 0)) {
  sigma3.3 <- rnorm(30, 350, 200)
}
# Build three populations (one per heterogeneity level), each containing
# 30 items x 3 scales x 1000 respondents (90,000 rows). rnorm() recycles
# the length-30 mean/SD vectors across each block of 1000*30 draws, which
# lines up with the cycling rep(1:30, ...) item index — draw j within a
# scale block belongs to item ((j - 1) %% 30) + 1, so each item gets its
# own mu/sigma. Scale blocks are stacked in order (small, medium, large).

# low heterogeneity: sigma1.* item SDs
population1 <- data.frame(
item = rep(1:30, 1000*3),
scale = rep(1:3, each = 1000*30),
score = c(rnorm(1000*30, mean = mu1, sd = sigma1.1),
rnorm(1000*30, mean = mu2, sd = sigma1.2),
rnorm(1000*30, mean = mu3, sd = sigma1.3))
)
# medium heterogeneity: sigma2.* item SDs
population2 <- data.frame(
item = rep(1:30, 1000*3),
scale = rep(1:3, each = 1000*30),
score = c(rnorm(1000*30, mean = mu1, sd = sigma2.1),
rnorm(1000*30, mean = mu2, sd = sigma2.2),
rnorm(1000*30, mean = mu3, sd = sigma2.3))
)
# high heterogeneity: sigma3.* item SDs
population3 <- data.frame(
item = rep(1:30, 1000*3),
scale = rep(1:3, each = 1000*30),
score = c(rnorm(1000*30, mean = mu1, sd = sigma3.1),
rnorm(1000*30, mean = mu2, sd = sigma3.2),
rnorm(1000*30, mean = mu3, sd = sigma3.3))
)
# Sanity check: item-by-scale means should track mu1 / mu2 / mu3
tapply(population1$score, list(population1$item,
population1$scale), mean)
## 1 2 3
## 1 3.902902 30.69299 1060.1286
## 2 3.641517 46.63335 1011.1333
## 3 3.812100 42.54887 863.2523
## 4 3.854610 58.08905 989.1498
## 5 3.433522 41.86002 825.0407
## 6 4.143272 70.85346 950.5212
## 7 4.107761 54.18749 1126.6508
## 8 4.434764 37.49411 1012.9551
## 9 3.619174 40.15325 1168.6727
## 10 4.006974 36.88427 1061.4187
## 11 3.267784 49.45770 976.9003
## 12 3.813818 45.31035 1046.3892
## 13 4.269054 55.89109 1119.6837
## 14 3.960559 42.17977 900.3035
## 15 3.960783 56.66281 1158.3500
## 16 3.839746 32.36607 1201.1798
## 17 3.966508 69.96384 831.2347
## 18 4.079169 40.01837 1175.0270
## 19 4.357796 51.04958 914.0601
## 20 3.961464 44.83878 917.2872
## 21 3.965871 41.84450 1029.9013
## 22 4.347589 51.13396 1086.5989
## 23 4.045330 38.57286 1171.8549
## 24 4.239570 52.47608 955.2740
## 25 3.721038 58.97741 1066.0189
## 26 3.858812 50.52820 923.1816
## 27 4.219286 48.65683 831.5085
## 28 3.862671 50.88695 1087.1130
## 29 3.704449 62.08790 1188.7390
## 30 4.492225 58.18327 872.2492
# Sanity check: item-by-scale SDs should track sigma1.1 / sigma1.2 / sigma1.3
tapply(population1$score, list(population1$item,
population1$scale), sd)
## 1 2 3
## 1 1.890627 21.46041 207.6584
## 2 2.058239 13.35917 388.9667
## 3 1.948857 21.17810 553.8019
## 4 1.991223 19.29285 414.9242
## 5 1.969047 17.21137 254.4290
## 6 1.883817 14.83067 459.1360
## 7 1.833318 27.00016 391.7086
## 8 2.122209 16.46313 332.1108
## 9 2.306645 18.89536 351.2334
## 10 1.754506 19.68230 484.4561
## 11 1.587011 19.31657 510.5722
## 12 1.802461 25.29054 599.7055
## 13 1.999581 29.08278 312.4897
## 14 1.789389 22.43142 422.3670
## 15 2.007439 25.12936 330.8983
## 16 2.050786 20.26657 384.7733
## 17 2.241356 18.31848 246.4385
## 18 2.214826 18.86756 523.2094
## 19 2.043198 23.52826 499.4072
## 20 1.823680 22.15486 234.5522
## 21 2.218899 19.23311 440.9283
## 22 1.784586 22.58415 261.4689
## 23 1.787658 25.83040 400.6378
## 24 2.032558 26.43104 355.6507
## 25 2.135126 20.06216 247.8450
## 26 1.902208 22.26583 427.4451
## 27 2.167401 20.20288 313.7482
## 28 2.007897 17.87304 272.3421
## 29 2.293348 22.59399 216.8236
## 30 1.541930 17.86823 218.4869
# Sanity check: population 2 item-by-scale means (same mus as population 1)
tapply(population2$score, list(population2$item,
population2$scale), mean)
## 1 2 3
## 1 3.950501 32.84247 1067.0680
## 2 3.750543 47.59086 1033.9959
## 3 3.735677 42.51874 866.4659
## 4 3.988222 58.44071 987.2586
## 5 3.334382 41.48621 809.2516
## 6 4.158273 69.06516 947.8041
## 7 4.175987 53.80756 1134.0584
## 8 4.319914 39.56735 1018.3653
## 9 3.580827 40.26965 1152.9811
## 10 4.110682 36.59191 1075.4541
## 11 3.259170 49.49791 964.4849
## 12 3.768975 44.28268 1045.0263
## 13 4.254734 56.40228 1108.5536
## 14 3.929969 43.31388 914.6370
## 15 4.083672 56.00580 1140.3217
## 16 3.908679 32.36602 1199.3164
## 17 3.961111 70.97181 821.8387
## 18 4.106718 39.96760 1170.1621
## 19 4.325670 51.91558 916.6885
## 20 4.054181 45.07555 899.0236
## 21 4.059961 41.05976 1015.0842
## 22 4.227156 51.13586 1082.8396
## 23 4.048451 37.64497 1166.4801
## 24 4.116550 52.55213 944.5579
## 25 3.567186 58.03799 1057.1553
## 26 3.890774 49.23870 960.5072
## 27 4.290088 50.13873 853.1833
## 28 3.834475 50.28265 1083.9270
## 29 3.685025 60.31290 1197.7951
## 30 4.433935 57.31146 865.8817
# Sanity check: item-by-scale SDs should track sigma2.1 / sigma2.2 / sigma2.3
tapply(population2$score, list(population2$item,
population2$scale), sd)
## 1 2 3
## 1 2.763139 35.581762 462.86038
## 2 1.390085 26.486240 448.23769
## 3 1.577165 11.730959 362.25854
## 4 2.255374 6.199381 270.24510
## 5 2.409540 34.898014 98.17839
## 6 1.674612 24.129790 74.10968
## 7 2.026715 11.583843 168.78975
## 8 1.820940 36.292835 475.80906
## 9 2.493294 31.252863 318.89486
## 10 2.016523 12.262949 357.21619
## 11 2.468095 18.159822 202.84176
## 12 1.551968 26.833445 344.64094
## 13 2.105696 10.203591 317.69026
## 14 1.528888 26.162484 65.89543
## 15 2.277393 24.108951 414.26184
## 16 1.646159 22.434883 326.78805
## 17 2.355252 45.120423 284.69863
## 18 1.533480 8.700002 423.75593
## 19 2.019509 32.359313 514.33388
## 20 1.924035 22.845239 303.14725
## 21 1.853170 23.675594 228.67923
## 22 1.998172 27.970055 466.10046
## 23 2.004897 18.415869 293.20638
## 24 2.744461 22.377501 436.79862
## 25 2.224172 16.478427 355.94647
## 26 1.384873 11.376008 285.61822
## 27 2.655162 36.746303 40.95487
## 28 1.693612 13.956762 286.35782
## 29 1.962601 34.851604 20.44412
## 30 1.769749 18.638239 309.10752
# Sanity check: population 3 item-by-scale means (same mus as population 1)
tapply(population3$score, list(population3$item,
population3$scale), mean)
## 1 2 3
## 1 3.855587 32.81676 1048.7199
## 2 3.868187 47.07626 1031.3172
## 3 3.845957 43.81290 863.0374
## 4 3.959157 57.44538 994.3256
## 5 3.584825 42.82264 845.7737
## 6 3.963773 69.50026 962.7873
## 7 4.107680 55.15474 1128.8592
## 8 4.215722 37.10898 1006.5998
## 9 3.702241 40.51058 1194.0925
## 10 3.993711 36.59024 1078.7180
## 11 3.269975 49.29718 972.6811
## 12 3.678456 44.17142 1067.3000
## 13 4.233037 56.61749 1131.7474
## 14 3.912706 43.10965 914.6692
## 15 4.054028 55.54440 1159.3063
## 16 3.958513 33.59518 1218.8248
## 17 3.997336 69.08474 827.5486
## 18 4.163447 38.55526 1150.8177
## 19 4.243718 51.62201 898.6121
## 20 4.011720 45.36838 911.8294
## 21 4.101582 41.40890 993.1776
## 22 4.183665 50.90172 1079.1560
## 23 4.171506 39.87135 1168.3530
## 24 4.020276 51.28601 952.6944
## 25 3.849307 58.74007 1075.0134
## 26 3.807178 49.43144 950.0585
## 27 4.115374 49.02862 858.4084
## 28 4.022901 49.87264 1103.1873
## 29 3.832931 59.77174 1206.0137
## 30 4.582119 57.30478 866.5154
# Sanity check: item-by-scale SDs should track sigma3.1 / sigma3.2 / sigma3.3
tapply(population3$score, list(population3$item,
population3$scale), sd)
## 1 2 3
## 1 3.2769848 26.8364765 423.59264
## 2 2.7331542 0.1083774 169.37507
## 3 2.2973465 19.4175129 209.03329
## 4 2.3507781 41.4654945 129.99759
## 5 1.7898735 30.7513516 505.77952
## 6 1.9010021 10.0210657 282.49171
## 7 3.3702103 23.2926277 344.78506
## 8 1.9120117 12.4672783 439.83139
## 9 3.2527780 5.7468707 396.73471
## 10 2.0874351 10.1767588 698.84007
## 11 2.7977082 4.6360499 239.35434
## 12 3.3006312 12.0540315 458.16247
## 13 1.1770792 12.9508350 466.15400
## 14 2.3335247 18.0051530 430.97667
## 15 0.7675617 30.6424737 23.87155
## 16 1.5466065 31.4250583 323.38895
## 17 1.7805748 29.3918983 180.69885
## 18 2.6329110 30.5168016 495.70330
## 19 4.0269547 17.5234986 553.31890
## 20 0.8437944 30.3494229 326.23455
## 21 1.2479703 8.6152555 451.69516
## 22 2.2823698 29.3847960 510.45603
## 23 2.6252197 43.4371529 212.60874
## 24 5.1654318 18.5521068 342.31012
## 25 1.6935013 47.2660927 261.54093
## 26 1.4942078 20.2885886 131.62100
## 27 2.7674848 10.4767779 162.94589
## 28 2.8525696 8.2541909 164.82719
## 29 1.8855775 11.4233467 356.47995
## 30 1.8449062 26.5875137 396.29682
Samples. Each population was then sampled as if a researcher was conducting a pilot study. The sample sizes started at 20 participants per item increasing in units of 5 up to 100 participants.
# Draw pilot samples of 20 to 100 participants per item (in steps of 5)
# from each population. slice_sample() draws without replacement within
# each item-by-scale cell, so samplesX[[i]] holds sizes[i] rows per cell.
samples1 <- samples2 <- samples3 <- list()
sizes <- c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100)
for (i in seq_along(sizes)) {
  samples1[[i]] <- population1 %>%
    group_by(item, scale) %>%
    slice_sample(n = sizes[i])
  samples2[[i]] <- population2 %>%
    group_by(item, scale) %>%
    slice_sample(n = sizes[i])
  samples3[[i]] <- population3 %>%
    group_by(item, scale) %>%
    slice_sample(n = sizes[i])
}
AIPE Criterions. The standard errors of each item were calculated to mimic the AIPE procedure of finding an appropriately small confidence interval, as standard error functions as the main component in the formula for normal distribution confidence intervals. Standard errors were calculated at each decile of the items up to 90% (0% smallest SE, 10% …, 90% largest SE). The lower deciles would represent a strict criterion for accurate measurement, as many items would need smaller SEs to meet AIPE goals, while the higher deciles would represent less strict criterions for AIPE goals.
# For every pilot sample compute:
#   sd_itemsX - the SD of the item SDs per scale (heterogeneity estimate),
#   SESX      - each item's standard error (30 x 3 matrix),
#   cutoffsX  - SE cutoffs at each decile (row 1 = 0% smallest SE, ...,
#               row 10 = 90% largest SE; one column per scale).
SES1 <- SES2 <- SES3 <- list()
cutoffs1 <- cutoffs2 <- cutoffs3 <- list()
sd_items1 <- sd_items2 <- sd_items3 <- list()
for (i in seq_along(samples1)) {
  sd_items1[[i]] <- samples1[[i]] %>% group_by(item, scale) %>%
    summarize(sd = sd(score), .groups = "keep") %>%
    ungroup() %>% group_by(scale) %>% summarize(sd_item = sd(sd))
  sd_items2[[i]] <- samples2[[i]] %>% group_by(item, scale) %>%
    summarize(sd = sd(score), .groups = "keep") %>%
    ungroup() %>% group_by(scale) %>% summarize(sd_item = sd(sd))
  sd_items3[[i]] <- samples3[[i]] %>% group_by(item, scale) %>%
    summarize(sd = sd(score), .groups = "keep") %>%
    ungroup() %>% group_by(scale) %>% summarize(sd_item = sd(sd))
  SES1[[i]] <- tapply(samples1[[i]]$score,
                      list(samples1[[i]]$item,
                           samples1[[i]]$scale),
                      function(x) { sd(x) / sqrt(length(x)) })
  SES2[[i]] <- tapply(samples2[[i]]$score,
                      list(samples2[[i]]$item,
                           samples2[[i]]$scale),
                      function(x) { sd(x) / sqrt(length(x)) })
  # BUG FIX: the original grouped samples3 scores by samples2's item/scale
  # vectors (copy-paste error). The grouped frames share row ordering, so
  # the values coincide, but the grouping must come from samples3 itself.
  SES3[[i]] <- tapply(samples3[[i]]$score,
                      list(samples3[[i]]$item,
                           samples3[[i]]$scale),
                      function(x) { sd(x) / sqrt(length(x)) })
  cutoffs1[[i]] <- apply(as.data.frame(SES1[[i]]), 2,
                         quantile,
                         probs = seq(0, .9, by = .1))
  cutoffs2[[i]] <- apply(as.data.frame(SES2[[i]]), 2,
                         quantile,
                         probs = seq(0, .9, by = .1))
  cutoffs3[[i]] <- apply(as.data.frame(SES3[[i]]), 2,
                         quantile,
                         probs = seq(0, .9, by = .1))
}
In this section, we simulate what a researcher might do if they follow our suggested application of AIPE to sample size planning based on well measured items. Assuming each pilot sample represents a dataset a researcher has collected, we will simulate samples of 20 to 500 to determine what the new sample size suggestion would be. We assume that samples over 500 may be considered too large for many researchers who do not work in teams or have participant funds. The standard error of each item was calculated for each suggested sample size by pilot sample size by population type.
# Project each pilot sample forward: bootstrap (sample with replacement)
# candidate sample sizes of 20 to 500 from each pilot sample and record
# every item's SE at each candidate size.
samplesize_values <- seq(20, 500, 5)
# one wide data frame per pilot sample size: one row per candidate size,
# columns 1:90 = item-by-scale SEs, column 91 = the candidate size itself
sampled_values1 <- sampled_values2 <- sampled_values3 <- list()
# loop over the pilot samples
for (i in seq_along(samples1)) {
  # blank table to fill in (30 items x 3 scales = 90 SE columns)
  sim_table1 <- matrix(NA,
                       nrow = length(samplesize_values),
                       ncol = 30 * 3)
  # make it a data frame, one copy per population
  sim_table1 <- sim_table2 <- sim_table3 <- as.data.frame(sim_table1)
  # add a place for sample size values
  sim_table1$sample_size <- sim_table2$sample_size <- sim_table3$sample_size <- NA
  # loop over candidate sample sizes
  for (q in seq_along(samplesize_values)) {
    # resample the pilot data with replacement and summarize item SEs;
    # temp$se is ordered item-major (item 1 scales 1-3, item 2 scales 1-3, ...)
    temp <- samples1[[i]] %>%
      group_by(item, scale) %>%
      slice_sample(n = samplesize_values[q], replace = TRUE) %>%
      summarize(se = sd(score) / sqrt(length(score)),
                .groups = "keep")
    sim_table1[q, 1:90] <- temp$se
    sim_table1[q, 91] <- samplesize_values[q]
    temp <- samples2[[i]] %>%
      group_by(item, scale) %>%
      slice_sample(n = samplesize_values[q], replace = TRUE) %>%
      summarize(se = sd(score) / sqrt(length(score)),
                .groups = "keep")
    sim_table2[q, 1:90] <- temp$se
    sim_table2[q, 91] <- samplesize_values[q]
    temp <- samples3[[i]] %>%
      group_by(item, scale) %>%
      slice_sample(n = samplesize_values[q], replace = TRUE) %>%
      summarize(se = sd(score) / sqrt(length(score)),
                .groups = "keep")
    sim_table3[q, 1:90] <- temp$se
    sim_table3[q, 91] <- samplesize_values[q]
  } # end candidate size loop
  sampled_values1[[i]] <- sim_table1
  sampled_values2[[i]] <- sim_table2
  sampled_values3[[i]] <- sim_table3
} # end pilot sample loop
Next, the percent of items that fall below the cutoff scores, and thus, would be considered “well-measured” were calculated for each decile by sample.
# For every candidate sample size, compute the percent of the 30 items
# whose bootstrap SE falls below each decile cutoff. The original code
# repeated the same pipeline nine times (3 populations x 3 scales);
# the two helpers below factor out that duplication.

# Reshape one wide bootstrap table into long format, recovering the item
# and scale indices from the wide column ordering (columns were filled
# item-major: item 1 scales 1-3, item 2 scales 1-3, ...).
lengthen_ses <- function(sampled) {
  sampled %>%
    pivot_longer(cols = -c(sample_size)) %>%
    rename(item = name, se = value) %>%
    mutate(scale = rep(1:3, 30 * length(samplesize_values))) %>%
    mutate(item = rep(rep(1:30, each = 3), length(samplesize_values)))
}

# Percent of items below each decile cutoff for one scale of one long
# table. cutoff_mat has quantile rows 1:10 (0%..90%) and scale columns;
# each (sample_size, scale) group holds 30 item rows, hence the /30.
percent_below <- function(long_df, cutoff_mat, scale_idx, n_pilot, src) {
  cuts <- cutoff_mat[, scale_idx]
  long_df %>%
    filter(scale == scale_idx) %>%
    group_by(sample_size, scale) %>%
    summarize(Percent_Below0 = sum(se <= cuts[1])/30,
              Percent_Below10 = sum(se <= cuts[2])/30,
              Percent_Below20 = sum(se <= cuts[3])/30,
              Percent_Below30 = sum(se <= cuts[4])/30,
              Percent_Below40 = sum(se <= cuts[5])/30,
              Percent_Below50 = sum(se <= cuts[6])/30,
              Percent_Below60 = sum(se <= cuts[7])/30,
              Percent_Below70 = sum(se <= cuts[8])/30,
              Percent_Below80 = sum(se <= cuts[9])/30,
              Percent_Below90 = sum(se <= cuts[10])/30,
              .groups = "keep") %>%
    mutate(original_n = n_pilot,
           source = src)
}

summary_list1 <- summary_list2 <- summary_list3 <- list()
for (i in seq_along(sampled_values1)) {
  # summary list 1 (low heterogeneity) ----
  long1 <- lengthen_ses(sampled_values1[[i]])
  summary_list1[[i]] <- bind_rows(
    percent_below(long1, cutoffs1[[i]], 1, sizes[i], "low"),
    percent_below(long1, cutoffs1[[i]], 2, sizes[i], "low"),
    percent_below(long1, cutoffs1[[i]], 3, sizes[i], "low"))
  # summary list 2 (medium heterogeneity) ----
  long2 <- lengthen_ses(sampled_values2[[i]])
  summary_list2[[i]] <- bind_rows(
    percent_below(long2, cutoffs2[[i]], 1, sizes[i], "med"),
    percent_below(long2, cutoffs2[[i]], 2, sizes[i], "med"),
    percent_below(long2, cutoffs2[[i]], 3, sizes[i], "med"))
  # summary list 3 (high heterogeneity) ----
  long3 <- lengthen_ses(sampled_values3[[i]])
  summary_list3[[i]] <- bind_rows(
    percent_below(long3, cutoffs3[[i]], 1, sizes[i], "high"),
    percent_below(long3, cutoffs3[[i]], 2, sizes[i], "high"),
    percent_below(long3, cutoffs3[[i]], 3, sizes[i], "high"))
}
summary_DF <- bind_rows(summary_list1,
                        summary_list2,
                        summary_list3)
From this data, we pinpoint the smallest suggested sample size at which 80%, 85%, 90%, and 95% of the items fall below the cutoff criterion. These values were chosen as popular measures of “power” in which one could determine the minimum suggested sample size (potentially 80% of items) and the maximum suggested sample size (potentially 90%).
# For each accuracy criterion (80/85/90/95% of items below cutoff), find
# the smallest candidate sample size meeting the criterion within every
# pilot size x decile x heterogeneity x scale cell. The original code
# repeated this pipeline four times; loop over the criteria instead.
find_min_n <- function(df, criterion) {
  df %>%
    pivot_longer(cols = -c(sample_size, original_n, source, scale)) %>%
    # keep only candidate sizes meeting the criterion (e.g., .80)
    filter(value >= criterion / 100) %>%
    # smallest qualifying sample_size sorts first within each cell
    arrange(sample_size, original_n, source, scale, name) %>%
    group_by(original_n, name, source, scale) %>%
    slice_head(n = 1) %>%
    mutate(power = criterion)
}
summary_long <- bind_rows(lapply(c(80, 85, 90, 95),
                                 function(p) find_min_n(summary_DF, p)))
# Label the heterogeneity and scale conditions for plotting
summary_long$source <- factor(summary_long$source,
levels = c("low", "med", "high"),
labels = c("Low Variance", "Medium Variance", "High Variance"))
summary_long$scale2 <- factor(summary_long$scale,
levels = c(1:3),
labels = c("Small Scale", "Medium Scale", "Large Scale"))
# Stack the per-pilot heterogeneity estimates and tag them with the pilot
# size and source; the rep() patterns rely on bind_rows() preserving the
# list order (each sd_itemsX element has 3 rows, one per scale)
sd_items <- bind_rows(sd_items1, sd_items2, sd_items3)
sd_items$original_n <- rep(rep(sizes, each = 3), 3)
sd_items$source <- rep(c("Low Variance", "Medium Variance", "High Variance"), each = 3*length(sizes))
# Attach sd_item (SD of the item SDs) to every matching summary row
summary_long <- summary_long %>%
full_join(sd_items,
by = c("original_n" = "original_n", "scale" = "scale",
"source" = "source"))
# Rename cutoff columns for display: "Percent_Below50" -> "Decile 50"
summary_long$name <- gsub("Percent_Below", "Decile ", summary_long$name)
We examined if this procedure is sensitive to differences in item heterogeneity, as we should expect to collect larger samples if we wish to have a large number of items reach a threshold of acceptable variance; potentially, assuring we could average them if a researcher did not wish to use a more complex analysis such as multilevel modeling.
The figure below illustrates the potential minimum sample size for 80% of items to achieve a desired cutoff score. The black dots denote the original sample size against the suggested sample size. By comparing the facets, we can determine that our suggested procedure does capture the differences in heterogeneity. As heterogeneity increases in item variances, the proposed sample size also increases, especially at stricter cutoffs. Missing cutoff points indicate conditions where the proposed sample sizes would be higher than 500.
# Re-level source for facet ordering
# NOTE(review): source was already converted to this factor earlier in the
# script, so this call is a no-op re-level — confirm it is intentional.
summary_long$source <- factor(summary_long$source,
levels = c("Low Variance", "Medium Variance", "High Variance"))
# Suggested vs. original sample size at the 80% criterion, faceted by
# heterogeneity x scale; black dots mark the y = x (original n) reference
ggplot(summary_long %>% filter(power == 80), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ source*scale2) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
In our second question, we examined if the suggested procedure was sensitive to the amount of information present in the pilot data. Larger pilot data is more informative, and therefore, we should expect a lower projected sample size. As shown in the figure below for only the low variability and small scale data, we do not find this effect. These simplistic simulations from the pilot data would nearly always suggest a larger sample size - mostly in a linear trend increasing with sample sizes. This result comes from the nature of the procedure - if we base our estimates on some SE cutoff, we will almost always need a bit more people for items to meet those goals. This result does not achieve our second goal.
# Uncorrected suggestions for the low-variance, small-scale condition,
# faceted by criterion level (80/85/90/95); black dots mark y = x
ggplot(summary_long %>% filter(source == "Low Variance") %>% filter(scale == 1), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
Therefore, we suggest using a correction factor on the simulation procedure to account for the known asymptotic nature of power (i.e., at larger sample sizes power increases level off). For this function, we combined a correction factor for upward biasing of effect sizes (Hedges’ correction) with the formula for exponential decay calculations. The decay factor is calculated as follows:
\[ 1 - \sqrt{\frac{N_{pilot} - min(N_{sim})}{N_{pilot}}}^{log_2(N_{pilot})}\] \(N_{pilot}\) indicates the sample size of the pilot data minus the minimum sample size for simulation to ensure that the smallest sample sizes do not decay (e.g., the formula zeroes out). This value is raised to the power of \(log_2\) of the sample size of the pilot data, which decreases the impact of the decay to smaller increments for increasing sample sizes. This value is then multiplied by the proposed sample size. As shown in the figure below, this correction factor produces the desired quality of maintaining that small pilot studies should increase sample size, and that sample size suggestions level off as pilot study data sample size increases.
# Exponential-decay correction factor: shrinks the suggested sample size
# as the pilot sample grows, leveling off for larger pilots. The smallest
# simulated pilot size is subtracted so the correction zeroes out there
# (originally hard-coded as 20; min(sizes) generalizes it to whatever
# pilot sizes were simulated).
min_pilot_n <- min(sizes)
decay <- 1 - sqrt((summary_long$original_n - min_pilot_n) / summary_long$original_n)^log2(summary_long$original_n)
summary_long$new_sample <- summary_long$sample_size * decay
# Corrected (decayed) suggestions for the low-variance, small-scale
# condition, faceted by criterion level; black dots mark y = x
ggplot(summary_long %>% filter(source == "Low Variance") %>% filter(scale == 1), aes(original_n, new_sample, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
We have portrayed that this procedure, with a correction factor, can perform as desired. However, within real scenarios, researchers will only have one pilot sample, not the various simulated samples shown above. What should the researcher do to correct their sample size on their own pilot data?
To explore if we could recover the new suggested sample size from data a researcher would have, we used linear models to create a formula for calculation. First, the corrected sample size was predicted by the original suggested sample size. Next, the standard deviation of the item standard deviations was added to the equation to recreate heterogeneity estimates. Last, we included the pilot sample size. The scale of the data is embedded into the standard deviation of the items (r = 0.88), and therefore, this variable was not included separately. The standard deviation of the item’s standard deviation embeds both heterogeneity and potential scale size.
# Hierarchical regressions: can the corrected sample size be recovered
# from quantities a researcher would actually have?
# Model 1: original suggested sample size only
user_model <- lm(new_sample ~ sample_size, data = summary_long)
user_print <- apa_print(user_model)
# Model 2: + SD of the item SDs (heterogeneity / scale proxy)
user_model2 <- lm(new_sample ~ sample_size + sd_item, data = summary_long)
user_print2 <- apa_print(user_model2)
# Model 3: + pilot sample size
user_model3 <- lm(new_sample ~ sample_size + sd_item + original_n, data = summary_long)
user_print3 <- apa_print(user_model3)
# Nested model comparison (Model 1 vs 2 vs 3)
change_table <- tidy(anova(user_model, user_model2, user_model3))
The first model using original sample size to predict new sample size was significant, \(R^2 = .89\), 90% CI \([0.88, 0.89]\), \(F(1, 5,347) = 42,734.67\), \(p < .001\), capturing nearly 90% of the variance, \(b = 0.64\), 95% CI \([0.63, 0.64]\). The second model with item standard deviation was better than the first model F(1, 5346) = 23.9846685, p < .001, \(R^2 = .89\), 90% CI \([0.88, 0.89]\). The item standard deviation predictor was significant, \(b = 0.01\), 95% CI \([0.00, 0.02]\), \(t(5346) = 2.98\), \(p = .003\). The addition of the original pilot sample size was also significant, F(1, 5345) = 9054.721618, p < .001, \(R^2 = .96\), 90% CI \([0.96, 0.96]\).
As shown in the table below for our final model, the new suggested sample size is proportional to the original suggested sample size (i.e., b < 1), which reduces the sample size suggestion. As variability increases, the suggested sample size also increases to capture differences in heterogeneity shown above; however, this predictor is not significant in the final model, and only contributes a small portion of overall variance. Last, in order to correct for large pilot data, the original pilot sample size decreases the new suggested sample size. This formula approximation captures 96% of the variance in sample size scores and should allow a researcher to estimate based on their own data.
# APA-formatted coefficient table for the final (three-predictor) model
apa_table(tidy(user_model3))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 41.26 | 0.49 | 84.40 | 0.00 |
| sample_size | 0.71 | 0.00 | 348.22 | 0.00 |
| sd_item | 0.00 | 0.00 | 0.48 | 0.63 |
| original_n | -0.75 | 0.01 | -95.16 | 0.00 |
# Refit the final model separately within each SE-decile cutoff and
# record each fit's R^2, to identify which decile best recovers the
# corrected sample size suggestion.
by_cutoff <- list()
R2 <- list()
for (cutoff in unique(summary_long$name)) {
  by_cutoff[[cutoff]] <- lm(new_sample ~ sample_size + sd_item + original_n,
                            data = summary_long %>% filter(name == cutoff))
  # use [[ ]] for single-element list assignment (was R2[cutoff] <- ...)
  R2[[cutoff]] <- summary(by_cutoff[[cutoff]])$r.squared
}
R2
## $`Decile 0`
## [1] 0.9538654
##
## $`Decile 10`
## [1] 0.9625936
##
## $`Decile 20`
## [1] 0.9659032
##
## $`Decile 30`
## [1] 0.9604812
##
## $`Decile 40`
## [1] 0.9642438
##
## $`Decile 50`
## [1] 0.9676218
##
## $`Decile 60`
## [1] 0.9670134
##
## $`Decile 70`
## [1] 0.964693
##
## $`Decile 80`
## [1] 0.9644982
##
## $`Decile 90`
## [1] 0.968693
Last, we examine the question of an appropriate SE decile. All graphs for power, heterogeneity, scale, and correction are presented below. First, the 0%, 10%, and 20% deciles are likely too restrictive, providing very large estimates that do not always find a reasonable sample size in proportion to the pilot sample size, scale, and heterogeneity. If we examine the \(R^2\) values for each decile of our regression equation separately, we find that the 50% (0.968) represents the best match to our corrected sample size suggestions. The 50% decile, in the corrected format, appears to meet all goals: 1) increases with heterogeneity and scale of data, and 2) higher suggested values for small original samples and a leveling effect at larger pilot data.
The formula for finding the corrected sample size using a 50% decile is: \(Final Sample = 0.637 \times X_{Suggested Sample Size} + 0.002 \times X_{SD Items} - 0.546 \times X_{Pilot Sample Size}\).
# Full grids of suggested sample sizes by scale x criterion, one pair of
# plots per heterogeneity condition (uncorrected, then corrected).
# Black dots mark the y = x (original n) reference in every panel.

# Low variance: uncorrected suggestions
ggplot(summary_long %>% filter(source == "Low Variance"), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ scale2*power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
# Low variance: corrected suggestions
ggplot(summary_long %>% filter(source == "Low Variance"), aes(original_n, new_sample, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ scale2*power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
# Medium variance: uncorrected suggestions
ggplot(summary_long %>% filter(source == "Medium Variance"), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ scale2*power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
# NOTE(review): this re-assignment repeats the earlier new_sample
# computation with unchanged inputs, so it is redundant — confirm it can
# be removed.
summary_long$new_sample <- summary_long$sample_size*decay
# Medium variance: corrected suggestions
ggplot(summary_long %>% filter(source == "Medium Variance"), aes(original_n, new_sample, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ scale2*power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
# High variance: uncorrected suggestions
ggplot(summary_long %>% filter(source == "High Variance"), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ scale2*power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
# High variance: corrected suggestions
ggplot(summary_long %>% filter(source == "High Variance"), aes(original_n, new_sample, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ scale2*power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")